In this document we will try to provide some tools to answer the
questions outlined in the challenge
#1 of the Fall 2024 DataKit event, that is about “understanding the
deficit–or surplus–in available housing units for households at
different income levels”.
We will specifically concentrate on gaining information about the
economic possibilities of the populace of a county.
#loading all the various packages
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
library(DT)
#loading the data frame
data_1_FL_getstarted <- read_csv("data_1_FL_getstarted.csv")
In our first approach we will visualize the distribution of incomes, and then we will segment it following these thresholds,
tibble(Income = c("Extremely Low", "Very Low", "Low", "Moderate", "Upper"),
Thresholds = c("Below 30% of AMI", "Below 50% of AMI", "Below 80% of AMI", "Between 80% and 120% of AMI", "> 120% AMI"))
as defined by the US Dept of Housing and Urban Development, where AMI stands for Area Median Income.
The county in exam will be Orange County, Florida, whose AMI is of 90400 US$.
#choosing the county and setting its Area Median Income
county_fips <- 95
county_AMI <- 90400
The following graph shows the distribution of income (defined as the Estimate of the Median Household Income in the last 12 Months) per Census Tract.
#we construct the graph for the county of choice
general_graph <- data_1_FL_getstarted %>%
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x))) %>%
ggplot(aes(med_hh_inc_est)) +
geom_density()
#we save the graph's information into an object
general_built <- ggplot_build(general_graph)
general_built_data <- data.frame(general_built$data[[1]], stringsAsFactors = F)
#we add the segments based on the county's AMI
general_built_data <- general_built_data %>%
mutate(Income = case_when(x < county_AMI * 0.3 ~ "Extremely Low",
x >= county_AMI * 0.3 & x < county_AMI * 0.5 ~ "Very Low",
x >= county_AMI * 0.5 & x < county_AMI * 0.8 ~ "Low",
x >= county_AMI * 0.8 & x < county_AMI * 1.2 ~ "Moderate",
x >= county_AMI * 1.2 ~ "Upper"),
#we define Income as an ordered factor variable to have the segments ordered in the graph
Income = factor(Income, levels = c("Extremely Low", "Very Low", "Low", "Moderate", "Upper"), ordered = TRUE))
#we define a colorblind friendly palette
cbbPalette <- c("#000000", "#E69F00", "#F0E442", "#56B4E9", "#009E73")
#we plot the income distribution segmented by the different income levels
ggplot(data = general_built_data, aes(x, y, fill = Income)) +
geom_line() +
geom_area() +
geom_vline(xintercept = county_AMI, linetype = 2) +
scale_x_continuous(labels = scales::label_dollar()) +
scale_y_continuous(labels = NULL, breaks = NULL) +
scale_fill_manual(values = cbbPalette) +
theme(legend.position = "bottom",
legend.title = element_blank()) +
labs(x = NULL,
y = NULL,
title ="Distribution of Income per Census Tract for Orange County, Florida")
From this we can gather that the majority are in “Low” segment, well below the AMI of 90400 US$ (showed with the vertical dashed line).
We can complement the graph with a table, to know how many census tracts are in each segment.
data_1_FL_getstarted %>%
#we pick the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x))) %>%
#we select the columns of interest, changing name to the income one
select(geoid, Income = med_hh_inc_est) %>%
#we add the segments based on the county's AMI
mutate(Income = case_when(Income < county_AMI * 0.3 ~ "Extremely Low",
Income >= county_AMI * 0.3 & Income < county_AMI * 0.5 ~ "Very Low",
Income >= county_AMI * 0.5 & Income < county_AMI * 0.8 ~ "Low",
Income >= county_AMI * 0.8 & Income < county_AMI * 1.2 ~ "Moderate",
Income >= county_AMI * 1.2 ~ "Upper"),
#we define Income as an ordered factor variable to have the segments ordered
Income = factor(Income, levels = c("Extremely Low", "Very Low", "Low", "Moderate", "Upper"), ordered = TRUE)) %>%
#we construct the table
count(Income, name = "Number of Census Tracts")
The NA line tells us how many census tracts have missing data.
Moving from the general income, we can extract more interesting insights if we segment by other factors so, starting with Race & Ethnicity (of Householder), we can produce this graph:
#data frame preparation
df_race <- data_1_FL_getstarted %>%
#we only select the columns of interest
select(geoid, county_fips_code, contains("inc") & contains("est") & !contains("age"), -med_hh_inc_est) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x))) %>%
#we make the Race columns more readable
rename(White = med_inc_white_est,
"Black or African American" = med_inc_black_est,
"American Indian and Alaska Native" = med_inc_ai_an_est,
"Asian" = med_inc_asian_est,
"Native Hawaiian and other Pacific Islander" = med_inc_nhopi_est,
"Some other Race" = med_inc_other_race_est,
"Two or more Races" = med_inc_2plus_races_est,
"Hispanic or Latino Origin" = med_inc_hisp_est,
"White Alone" = med_inc_white_nh_est) %>%
#we transform the data frame into long format, for easier plotting
pivot_longer(cols = 3:last_col(), names_to = "Race", values_to = "Income")
#we construct the graph for the county of choice, splitting by Race (& Ethnicity)
race_graph <- df_race %>%
filter(county_fips_code == county_fips &
#necessary to remove Races not present with an Income in a census tract for the chosen county
!is.na(Income)) %>%
ggplot(aes(Income)) +
geom_density() +
facet_wrap(~ Race)
#we save the graph's information into an object
race_built <- ggplot_build(race_graph)
race_built_data <- data.frame(race_built$data[[1]], stringsAsFactors = F)
#we add the segments based on the county's AMI
race_built_data <- race_built_data %>%
mutate(Income = case_when(x < county_AMI * 0.3 ~ "Extremely Low",
x >= county_AMI * 0.3 & x < county_AMI * 0.5 ~ "Very Low",
x >= county_AMI * 0.5 & x < county_AMI * 0.8 ~ "Low",
x >= county_AMI * 0.8 & x < county_AMI * 1.2 ~ "Moderate",
x >= county_AMI * 1.2 ~ "Upper"),
#we define Income as an ordered factor variable to have the segments ordered in the graph
Income = factor(Income, levels = c("Extremely Low", "Very Low", "Low", "Moderate", "Upper"), ordered = TRUE),
#we dynamically define the labels for the Race factor variable (necessary for facet_wrap()), only from Races present with an Income in the chosen County
Race = factor(PANEL, labels = sort(unique(df_race[df_race$county_fips_code == county_fips & !is.na(df_race$Income), ]$Race)))) %>%
#we remove the Races that have fewer than two data points (geom_density() couldn't construct a curve for them)
filter(!is.na(x))
#we define a colorblind friendly palette
cbbPalette <- c("#000000", "#E69F00", "#F0E442", "#56B4E9", "#009E73")
#we plot the income distribution segmented by the different income levels and further segmented by Race & Ethnicity
ggplot(data = race_built_data, aes(x, y, fill = Income)) +
geom_line() +
geom_area() +
scale_fill_manual(values = cbbPalette) +
geom_vline(xintercept = county_AMI, linetype = 2) +
facet_wrap(~ Race, ncol = 2) +
scale_x_continuous(labels = scales::label_dollar()) +
scale_y_continuous(labels = NULL, breaks = NULL) +
theme(legend.position = "bottom",
legend.title = element_blank()) +
labs(x = NULL,
y = NULL,
title = "Distribution of Income per Census Tract for Orange County, Florida, \n segmented by Race & Ethnicity")
Here we can see that some races have a more uniform distribution (like “Asian” or “Two or more Races”), while others present more prevalent numbers in the lower incomes segments.
As before, we can add a table to have some hard numbers, including also the races that don’t have enough data points to be plotted.
df_race %>%
#we pick the county of choice
filter(county_fips_code == county_fips &
#necessary to remove Races not present with an Income in a census tract for the chosen county
!is.na(Income)) %>%
#we add the segments based on the county's AMI
mutate(Income = case_when(Income < county_AMI * 0.3 ~ "Extremely Low",
Income >= county_AMI * 0.3 & Income < county_AMI * 0.5 ~ "Very Low",
Income >= county_AMI * 0.5 & Income < county_AMI * 0.8 ~ "Low",
Income >= county_AMI * 0.8 & Income < county_AMI * 1.2 ~ "Moderate",
Income >= county_AMI * 1.2 ~ "Upper"),
#we define Income as an ordered factor variable to have the segments ordered
Income = factor(Income, levels = c("Extremely Low", "Very Low", "Low", "Moderate", "Upper"), ordered = TRUE)) %>%
#we construct the table
count(Income, Race) %>%
#we transform the table into a wide format, for an easier consulting
pivot_wider(names_from = Race, values_from = n, values_fill = 0)
Lastly, we can study the income distribution by Age
#data frame preparation
df_age <- data_1_FL_getstarted %>%
#we only select the columns of interest
select(geoid, county_fips_code, contains("inc") & contains("age")) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x))) %>%
#we transform the data frame into a long format, for easier plotting
tidyr::pivot_longer(cols = 3:last_col(), names_to = "Age", values_to = "Income") %>%
#we make the Age column more readable
mutate(Age = str_sub(Age, start = 9, end = -5),
Age = str_replace_all(Age, "_", " "),
Age = str_to_title(Age))
#we construct the graph for the county of choice, splitting by Age
age_graph <- df_age %>%
filter(county_fips_code == county_fips &
#necessary to remove Races not present with an Income in a census tract for the chosen county
!is.na(Income)) %>%
ggplot(aes(Income)) +
geom_density() +
facet_wrap(~ Age)
#we save the graph's information into an object
age_built <- ggplot_build(age_graph)
age_built_data <- data.frame(age_built$data[[1]], stringsAsFactors = F)
#we add the segments based on the county's AMI
age_built_data <- age_built_data %>%
mutate(Income = case_when(x < county_AMI * 0.3 ~ "Extremely Low",
x >= county_AMI * 0.3 & x < county_AMI * 0.5 ~ "Very Low",
x >= county_AMI * 0.5 & x < county_AMI * 0.8 ~ "Low",
x >= county_AMI * 0.8 & x < county_AMI * 1.2 ~ "Moderate",
x >= county_AMI * 1.2 ~ "Upper"),
#we define Income as an ordered factor variable to have the segments ordered in the graph
Income = factor(Income, levels = c("Extremely Low", "Very Low", "Low", "Moderate", "Upper"), ordered = TRUE),
#we dynamically define the labels for the Age factor variable (necessary for facet_wrap()), , only from Ages present with an Income in the chosen County
Age = factor(PANEL, labels = sort(unique(df_age[df_age$county_fips_code == county_fips & !is.na(df_age$Income), ]$Age)))) %>%
#we remove the Ages that have fewer than two data points (geom_density() couldn't construct a curve for them)
filter(!is.na(x))
#we define a colorblind friendly palette
cbbPalette <- c("#000000", "#E69F00", "#F0E442", "#56B4E9", "#009E73")
#we plot the income distribution segmented by the different income levels and further segmented by Age
ggplot(data = age_built_data, aes(x, y, fill = Income)) +
geom_line() +
geom_area() +
scale_fill_manual(values = cbbPalette) +
geom_vline(xintercept = county_AMI, linetype = 2) +
facet_wrap(~ Age) +
scale_x_continuous(labels = scales::label_dollar()) +
scale_y_continuous(labels = NULL, breaks = NULL) +
theme(legend.position = "bottom",
legend.title = element_blank()) +
labs(x = NULL,
y = NULL,
title = "Distribution of Income per Census Tract for Orange County, Florida, \n segmented by Age")
As before, we can add a table to have some hard numbers:
df_age %>%
#we pick the county of choice
filter(county_fips_code == county_fips &
#necessary to remove Races not present with an Income in a census tract for the chosen county
!is.na(Income)) %>%
#we add the segments based on the county's AMI
mutate(Income = case_when(Income < county_AMI * 0.3 ~ "Extremely Low",
Income >= county_AMI * 0.3 & Income < county_AMI * 0.5 ~ "Very Low",
Income >= county_AMI * 0.5 & Income < county_AMI * 0.8 ~ "Low",
Income >= county_AMI * 0.8 & Income < county_AMI * 1.2 ~ "Moderate",
Income >= county_AMI * 1.2 ~ "Upper"),
#we define Income as an ordered factor variable to have the segments ordered
Income = factor(Income, levels = c("Extremely Low", "Very Low", "Low", "Moderate", "Upper"), ordered = TRUE)) %>%
#we construct the table
count(Income, Age) %>%
#we transform the table into a wide format, for an easier consulting
pivot_wider(names_from = Age, values_from = n, values_fill = 0) %>%
#we move the last age bracket at the end
relocate(`Age 65plus`, .after = last_col())
We also have some information about earnings (defined as the Estimate of Median Earnings (Dollars) in the Past 12 Months)
data_1_FL_getstarted %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x))) %>%
#we plot the distribution
ggplot(aes(earn_med_est)) +
geom_density(fill = "#0072B2") +
scale_x_continuous(labels = scales::label_dollar()) +
scale_y_continuous(labels = NULL, breaks = NULL) +
labs(x = NULL,
y = NULL,
title = "Distribution of Earnings per Census Tract for Orange County, Florida")
and how they differentiate by sex.
df_sex <- data_1_FL_getstarted %>%
#we only select the columns of interest
select(geoid, county_fips_code, contains("earn") & contains("est") & (contains("male") | contains("fem"))) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x))) %>%
#we make the Sex column more readable
mutate(Male = earn_med_male_est,
Female = earn_med_fem_est, .keep = "unused") %>%
#we transform the data frame into a long format, for easier plotting
tidyr::pivot_longer(cols = c(3, 4), names_to = "Sex", values_to = "Earnings")
#we define a colorblind friendly palette
cbbPalette <- c("#D55E00", "#CC79A7")
df_sex %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
ggplot(aes(Earnings, fill = Sex)) +
geom_density() +
facet_wrap(~ Sex) +
scale_fill_manual(values = cbbPalette) +
scale_x_continuous(labels = scales::label_dollar()) +
scale_y_continuous(labels = NULL, breaks = NULL) +
theme(legend.position = "none") +
labs(x = NULL,
y = NULL,
title = "Distribution of Earnings per Census Tract for Orange County, Florida, \n segmenting by Sex")
We didn’t compare them to the AMI (as they are different quantities) but they can be interesting to investigate either way.
We will also provide some interactive tables (downloadable to an excel file or pdf) to facilitate the data exploration, always for the Orange County in Florida.
In this table we can choose an income segment and/or a Race to individuate the Census Tract.
df_race %>%
#we pick the county of choice
filter(county_fips_code == county_fips &
#we remove the rows without a value in the Income column
!is.na(Income)) %>%
#we add the segments based on the county's AMI
mutate(Income = case_when(Income < county_AMI * 0.3 ~ "Extremely Low",
Income >= county_AMI * 0.3 & Income < county_AMI * 0.5 ~ "Very Low",
Income >= county_AMI * 0.5 & Income < county_AMI * 0.8 ~ "Low",
Income >= county_AMI * 0.8 & Income < county_AMI * 1.2 ~ "Moderate",
Income >= county_AMI * 1.2 ~ "Upper"),
#we define Income,Race and geoid as factor to easily select between their values in the table
Income = factor(Income),
Race = factor(Race),
geoid = factor(geoid)) %>%
#we select the columns of interest, changing the name of geoid
select(Income, Race, "Census Tract" = geoid) %>%
#we construct the dynamic table
datatable(filter = "top",
rownames = FALSE,
extensions = "Buttons",
options = list(dom = 'tpB',
buttons = c('excel', 'pdf')))
The same thing, but for the Age brackets.
df_age %>%
#we pick the county of choice
filter(county_fips_code == county_fips &
#we remove the rows without a value in the Income column
!is.na(Income)) %>%
#we add the segments based on the county's AMI
mutate(Income = case_when(Income < county_AMI * 0.3 ~ "Extremely Low",
Income >= county_AMI * 0.3 & Income < county_AMI * 0.5 ~ "Very Low",
Income >= county_AMI * 0.5 & Income < county_AMI * 0.8 ~ "Low",
Income >= county_AMI * 0.8 & Income < county_AMI * 1.2 ~ "Moderate",
Income >= county_AMI * 1.2 ~ "Upper"),
#we define Income, Age and geoid as factor to easily select between their values in the table
Income = factor(Income),
Age = factor(Age),
geoid = factor(geoid)) %>%
#we select the columns of interest, changing the name of geoid
select(Income, Age, "Census Tract" = geoid) %>%
#we construct the dynamic table
datatable(filter = "top",
rownames = FALSE,
extensions = "Buttons",
options = list(dom = 'tpB',
buttons = c('excel', 'pdf')))
Here instead we choose a Census Tract to get information about the income segments present in it, differentiating by both Race & Ethnicity and Age.
#we bind the race and age tables together, via the rows' indexes
df_race %>%
mutate(Income = case_when(Income < county_AMI * 0.3 ~ "Extremely Low",
Income >= county_AMI * 0.3 & Income < county_AMI * 0.5 ~ "Very Low",
Income >= county_AMI * 0.5 & Income < county_AMI * 0.8 ~ "Low",
Income >= county_AMI * 0.8 & Income < county_AMI * 1.2 ~ "Moderate",
Income >= county_AMI * 1.2 ~ "Upper"),
Income = factor(Income, levels = c("Extremely Low", "Very Low", "Low", "Moderate", "Upper"), ordered = TRUE)) %>%
pivot_wider(names_from = Race, values_from = Income) %>%
bind_cols(df_age %>%
mutate(Income = case_when(Income < county_AMI * 0.3 ~ "Extremely Low",
Income >= county_AMI * 0.3 & Income < county_AMI * 0.5 ~ "Very Low",
Income >= county_AMI * 0.5 & Income < county_AMI * 0.8 ~ "Low",
Income >= county_AMI * 0.8 & Income < county_AMI * 1.2 ~ "Moderate",
Income >= county_AMI * 1.2 ~ "Upper"),
Income = factor(Income, levels = c("Extremely Low", "Very Low", "Low", "Moderate", "Upper"), ordered = TRUE)) %>%
pivot_wider(names_from = Age, values_from = Income) %>%
select(-geoid, -county_fips_code)) %>%
#we select the county of interest
filter(county_fips_code == county_fips) %>%
#we select the columns of interest, changing the name of geoid
select("Census Tract" = geoid, everything(), -county_fips_code) %>%
#we transform the columns into factors, to easily select between their values in the table
mutate(across(everything(), ~ factor(.x))) %>%
#we construct the dynamic table
datatable(filter = "top",
rownames = FALSE,
extensions = c("Buttons", "FixedColumns"),
options = list(dom = 'tpB',
buttons = c('excel', 'pdf'),
scrollX = TRUE,
fixedColumns = list(leftColumns = 1)))
Another avenue of exploration is the cost of housing, defined as Median Monthly Housing Cost (Occupied Housing Units), versus the income. To quantity it we will use the Cost Income Ratio, that we define as the percentage of the income that goes into housing (cost / income * 100). As we have access to a monthly median cost, we will multiply it times 12 to compare with the annual income.
As before, we start with the general income, but instead of the curves seen until now we will use box plots, where it is important to know that the box contains 50% of the data points (Census Tracts in this case),
data_1_FL_getstarted %>%
#we select the columns of interest
select(geoid, county_fips_code, med_hh_inc_est, contains("housecost") & contains("est")) %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x))) %>%
#we create the cost income ratio variable
mutate(housing_cost_est = housecost_med_est * 12,
cost_income_ratio = housing_cost_est / med_hh_inc_est) %>%
#we plot the distribution
ggplot(aes("", cost_income_ratio)) +
geom_boxplot() +
scale_y_continuous(labels = scales::label_percent()) +
labs(x = NULL,
y = NULL,
title = "Distribution of Cost Income Ratio per Census Tract for Orange County, Florida")
and we can see that the majority pays less than 30%, which is considered as a healthy value.
If we want to investigate specific instances we can use this dynamic table:
data_1_FL_getstarted %>%
#we select the columns of interest
select(geoid, county_fips_code, med_hh_inc_est, contains("housecost") & contains("est")) %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x))) %>%
#we create the cost income ratio variable
mutate(housing_cost_est = housecost_med_est * 12,
cost_income_ratio = round(housing_cost_est / med_hh_inc_est * 100, 2)) %>%
#we select and change the names of the columns of interest
select("Census Tract" = geoid, Cost = housing_cost_est, Income = med_hh_inc_est, "Cost Income Ratio %" = cost_income_ratio) %>%
#we transform Census Tract into a factor, to easily select between its values
mutate(`Census Tract` = factor(`Census Tract`)) %>%
#we prepare the dynamic table
datatable(filter = "top",
rownames = FALSE,
extensions = "Buttons",
options = list(dom = 'tpB',
buttons = c('excel', 'pdf')))
We can split the graph in two, segmenting on whether the house is owned or rented,
#we define a colorblind friendly palette
cbbPalette <- c("#D55E00", "#CC79A7")
data_1_FL_getstarted %>%
#we select the columns of interest
select(geoid, county_fips_code, med_hh_inc_est, contains("housecost") & contains("est")) %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x)),
#we make the tenure columns more readable
Owner = housecost_owner_est,
Renter = housecost_rent_est) %>%
#we transform the data into long format, for easier plotting
pivot_longer(cols = c("Owner", "Renter"), names_to = "Tenure", values_to = "Cost") %>%
#we create the cost income ratio variable
mutate(Cost = Cost * 12,
cost_income_ratio = Cost / med_hh_inc_est) %>%
#we plot the distributions
ggplot(aes("", cost_income_ratio, fill = Tenure)) +
geom_boxplot() +
facet_wrap(~ Tenure, ncol = 2) +
scale_fill_manual(values = cbbPalette) +
scale_y_continuous(labels = scales::label_percent()) +
theme(legend.position = "none") +
labs(x = NULL,
y = NULL,
title = "Distribution of Cost Income Ratio per Census Tract for Orange County, Florida, \n segmenting by Tenure")
and we can see that homeowners generally allocate a minor budget to the yearly housing costs.
We provide again a dynamic table:
data_1_FL_getstarted %>%
#we select the columns of interest
select(geoid, county_fips_code, med_hh_inc_est, contains("housecost") & contains("est")) %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x)),
#we make the tenure columns more readable
Owner = housecost_owner_est,
Renter = housecost_rent_est) %>%
#we transform the data into long format, for easier plotting
pivot_longer(cols = c("Owner", "Renter"), names_to = "Tenure", values_to = "Cost") %>%
#we create the cost income ratio variable
mutate(Cost = Cost * 12,
cost_income_ratio = round(Cost / med_hh_inc_est * 100, 2)) %>%
#we select and change the names of the columns of interest
select("Census Tract" = geoid, 7, 8, Income = med_hh_inc_est, "Cost Income Ratio %" = cost_income_ratio) %>%
#we transform the columns into factors, to easily select between their values in the table
mutate(across(where(is.character), ~ factor(.x)),
`Census Tract` = factor(`Census Tract`)) %>%
#we prepare the dynamic table
datatable(filter = "top",
rownames = FALSE,
extensions = "Buttons",
options = list(dom = 'tpB',
buttons = c('excel', 'pdf')))
We can continue segmenting by Race (& Ethnicity):
#we define a colorblind friendly palette
cbbPalette <- c("#D55E00", "#CC79A7")
data_1_FL_getstarted %>%
#we select the columns of interest
select(geoid, county_fips_code, contains("housecost") & contains("est"), contains("inc") & contains("est") & !contains("age")) %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x)),
#we make the Tenure columns more readable
Owner = housecost_owner_est,
Renter = housecost_rent_est) %>%
#we make the Race columns more readable
rename(White = med_inc_white_est,
"Black or African American" = med_inc_black_est,
"American Indian and Alaska Native" = med_inc_ai_an_est,
"Asian" = med_inc_asian_est,
"Native Hawaiian and other Pacific Islander" = med_inc_nhopi_est,
"Some other Race" = med_inc_other_race_est,
"Two or more Races" = med_inc_2plus_races_est,
"Hispanic or Latino Origin" = med_inc_hisp_est,
"White Alone" = med_inc_white_nh_est) %>%
#we transform the data into long format, for easier plotting
pivot_longer(cols = c("Owner", "Renter"), names_to = "Tenure", values_to = "Cost") %>%
pivot_longer(cols = White:`White Alone`, names_to = "Race", values_to = "Income") %>%
#we create the cost income ratio variable
mutate(Cost = Cost * 12,
cost_income_ratio = Cost / Income) %>%
#we remove the census tracts with no data, to not have unplottable values
filter(!is.na(cost_income_ratio)) %>%
#we plot the distributions
ggplot(aes(Tenure, cost_income_ratio, fill = Tenure)) +
geom_boxplot() +
facet_wrap(~ Race, scales = "free") +
scale_fill_manual(values = cbbPalette) +
scale_y_continuous(labels = scales::label_percent()) +
theme(legend.position = "none") +
labs(x = NULL,
y = NULL,
title = "Distribution of Cost Income Ratio per Census Tract for Orange County, Florida, \n segmenting by Tenure and Race (& Ethnicity)")
Generally what we’ve seen before holds true also here, for some races though there are values that are very large (almost 800% for “Asian” for example), and that is worth investigating further:
data_1_FL_getstarted %>%
#we select the columns of interest
select(geoid, county_fips_code, contains("housecost") & contains("est"), contains("inc") & contains("est") & !contains("age")) %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x)),
#we make the tenure columns more readable
Owner = housecost_owner_est,
Renter = housecost_rent_est) %>%
#we make the Race column more readable
rename(White = med_inc_white_est,
"Black or African American" = med_inc_black_est,
"American Indian and Alaska Native" = med_inc_ai_an_est,
"Asian" = med_inc_asian_est,
"Native Hawaiian and other Pacific Islander" = med_inc_nhopi_est,
"Some other Race" = med_inc_other_race_est,
"Two or more Races" = med_inc_2plus_races_est,
"Hispanic or Latino Origin" = med_inc_hisp_est,
"White alone" = med_inc_white_nh_est) %>%
#we transform the data into long format, for easier plotting
pivot_longer(cols = c("Owner", "Renter"), names_to = "Tenure", values_to = "Cost") %>%
pivot_longer(cols = White:`White alone`, names_to = "Race", values_to = "Income") %>%
#we create the cost income ratio variable
mutate(Cost = Cost * 12,
cost_income_ratio = formattable::percent(Cost / Income)) %>%
#we select and change name of the variables of interest
select(-c(2, 3, 4, 5, 6), "Census Tract" = geoid, "Cost Income Ratio" = cost_income_ratio) %>%
#we move Race after Tenure
relocate(Race, .after = Tenure) %>%
#we show the highest values of Cost Income Ratio on top
arrange(desc(`Cost Income Ratio`))
We see Income values that are very low, and they are actually “Jam Values”, that signifies that the actual income is in fact lower than 2500 US$, suggesting that more attention is to reserved for these instances.
As before, we provide a dynamic table as well:
data_1_FL_getstarted %>%
#we select the columns of interest
select(geoid, county_fips_code, contains("housecost") & contains("est"), contains("inc") & contains("est") & !contains("age")) %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x)),
#we make the tenure columns more readable
Owner = housecost_owner_est,
Renter = housecost_rent_est) %>%
#we make the Race column more readable
rename(White = med_inc_white_est,
"Black or African American" = med_inc_black_est,
"American Indian and Alaska Native" = med_inc_ai_an_est,
"Asian" = med_inc_asian_est,
"Native Hawaiian and other Pacific Islander" = med_inc_nhopi_est,
"Some other Race" = med_inc_other_race_est,
"Two or more Races" = med_inc_2plus_races_est,
"Hispanic or Latino Origin" = med_inc_hisp_est,
"White alone" = med_inc_white_nh_est) %>%
#we transform the data into long format, for easier plotting
pivot_longer(cols = c("Owner", "Renter"), names_to = "Tenure", values_to = "Cost") %>%
pivot_longer(cols = White:`White alone`, names_to = "Race", values_to = "Income") %>%
#we create the cost income ratio variable
mutate(Cost = Cost * 12,
cost_income_ratio = round(Cost / Income * 100, 2)) %>%
#we select and change name of the variables of interests
select(-c(2, 3, 4, 5, 6), "Census Tract" = geoid, "Cost Income Ratio %" = cost_income_ratio) %>%
#we move Race after Tenure
relocate(Race, .after = Tenure) %>%
#we transform the columns into factors, to easily select between their values in the table
mutate(across(where(is.character), ~ factor(.x)),
`Census Tract` = factor(`Census Tract`)) %>%
#we construct the dynamic table
datatable(filter = "top",
rownames = FALSE,
extensions = "Buttons",
options = list(dom = 'tpB',
buttons = c('excel', 'pdf')))
We can apply the same kind of analysis to ages:
#we define a colorblind friendly palette
cbbPalette <- c("#D55E00", "#CC79A7")
data_1_FL_getstarted %>%
#we select the columns of interest
select(geoid, county_fips_code, contains("housecost") & contains("est"), contains("inc") & contains("est") & contains("age")) %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x)),
#we make the tenure columns more readable
Owner = housecost_owner_est,
Renter = housecost_rent_est) %>%
#we transform the data into long format, for easier plotting
pivot_longer(cols = c("Owner", "Renter"), names_to = "Tenure", values_to = "Cost") %>%
pivot_longer(cols = contains("inc"), names_to = "Age", values_to = "Income") %>%
#we make the Age column more readable
mutate(Age = str_sub(Age, start = 9, end = -5),
Age = str_replace_all(Age, "_", " "),
Age = str_to_title(Age)) %>%
#we create the cost income ratio variable
mutate(Cost = Cost * 12,
cost_income_ratio = Cost / Income) %>%
#we plot the distributions
ggplot(aes(Tenure, cost_income_ratio, fill = Tenure)) +
geom_boxplot() +
facet_wrap(~ Age, scales = "free") +
scale_fill_manual(values = cbbPalette) +
scale_y_continuous(labels = scales::label_percent()) +
theme(legend.position = "none") +
labs(x = NULL,
y = NULL,
title = "Distribution of Cost Income Ratio per Census Tract for Orange County, Florida, \n segmenting by Tenure and Age")
Again, the out of proportion values are once more determined by jam values.
data_1_FL_getstarted %>%
#we select the columns of interest
select(geoid, county_fips_code, contains("housecost") & contains("est"), contains("inc") & contains("est") & contains("age")) %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x)),
#we make the tenure columns more readable
Owner = housecost_owner_est,
Renter = housecost_rent_est) %>%
#we transform the data into long format, for easier plotting
pivot_longer(cols = c("Owner", "Renter"), names_to = "Tenure", values_to = "Cost") %>%
pivot_longer(cols = contains("inc"), names_to = "Age", values_to = "Income") %>%
#we make the Age column more readable
mutate(Age = str_sub(Age, start = 9, end = -5),
Age = str_replace_all(Age, "_", " "),
Age = str_to_title(Age)) %>%
#we create the cost income ratio variable
mutate(Cost = Cost * 12,
cost_income_ratio = formattable::percent(Cost / Income)) %>%
#we select and change name of the variables of interest
select(-c(2, 3, 4, 5), "Census Tract" = geoid, "Cost Income Ratio" = cost_income_ratio) %>%
#we move Age after Tenure
relocate(Age, .after = Tenure) %>%
#we show the highest values of Cost Income Ratio on top
arrange(desc(`Cost Income Ratio`))
The usual dynamic table ensues, for specific investigations.
data_1_FL_getstarted %>%
#we select the columns of interest
select(geoid, county_fips_code, contains("housecost") & contains("est"), contains("inc") & contains("est") & contains("age")) %>%
#we construct the graph for the county of choice
filter(county_fips_code == county_fips) %>%
#we change the negative values (present with missing data) to NAs
mutate(across(everything(), ~ case_when(.x >=0 ~ .x)),
#we make the tenure columns more readable
Owner = `housecost_owner_est`,
Renter = `housecost_rent_est`) %>%
#we transform the data into long format, for easier plotting
pivot_longer(cols = c("Owner", "Renter"), names_to = "Tenure", values_to = "Cost") %>%
pivot_longer(cols = contains("inc"), names_to = "Age", values_to = "Income") %>%
#we make the Age column more readable
mutate(Age = str_sub(Age, start = 9, end = -5),
Age = str_replace_all(Age, "_", " "),
Age = str_to_title(Age)) %>%
#we create the cost income ratio variable
mutate(Cost = Cost * 12,
cost_income_ratio = formattable::percent(Cost / Income)) %>%
#we select and change names of the variable of interest
select(-c(2, 3, 4, 5), "Census Tract" = geoid, "Cost Income Ratio %" = cost_income_ratio) %>%
#we move Age after Tenure
relocate(Age, .after = Tenure) %>%
#we transform the columns into factors, to easily select between their values in the table
mutate(across(where(is.character), ~ factor(.x)),
`Census Tract` = factor(`Census Tract`)) %>%
#we construct the table
datatable(filter = "top",
rownames = FALSE,
extensions = "Buttons",
options = list(dom = 'tpB',
buttons = c('excel', 'pdf')))